Data Munging Practice

Practicing data wrangling and cleaning using various Python libraries to improve my data science practices and create some reusable code I can use for future projects.


In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Automating Data Extraction


In [22]:
import os
import urllib

root_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/"
full_url = root_url + "cylinder-bands/bands.data"
dataset_location = os.path.join("datasets","cylinder-bands")
filename = 'cylinder-bands.csv'

def fetch_data(filename, full_url=full_url, dataset_location=dataset_location):
    if not os.path.isdir(dataset_location):
        os.makedirs(dataset_location)
    file_path = os.path.join(dataset_location, filename)
    urllib.request.urlretrieve(full_url, file_path)

In [23]:
fetch_data("cylinder-bands.csv")

In [24]:
def load_data(filename, dataset_location=dataset_location):
    csv_path = os.path.join(dataset_location, filename)
    return pd.read_csv(csv_path, header=None)

In [25]:
cylinder_bands = load_data("cylinder-bands.csv")
cylinder_bands.head()


Out[25]:
0 1 2 3 4 5 6 7 8 9 ... 30 31 32 33 34 35 36 37 38 39
0 19910108 X126 TVGUIDE 25503 YES KEY YES BENTON GALLATIN UNCOATED ... 36.4 0 0 2.5 1 34 40 105 100 band
1 19910109 X266 TVGUIDE 25503 YES KEY YES BENTON GALLATIN UNCOATED ... 38.5 0 0 2.5 0.7 34 40 105 100 noband
2 19910104 B7 MODMAT 47201 YES KEY YES BENTON GALLATIN UNCOATED ... 39.8 0 0 2.8 0.9 40 40 103.87 100 noband
3 19910104 T133 MASSEY 39039 YES KEY YES BENTON GALLATIN UNCOATED ... 38.8 0 0 2.5 1.3 40 40 108.06 100 noband
4 19910111 J34 KMART 37351 NO KEY YES BENTON GALLATIN UNCOATED ... 42.5 5 0 2.3 0.6 35 40 106.67 100 noband

5 rows × 40 columns


In [26]:
attributes = [
    'timestamp',
    'cylinder number',
    'customer',
    'job number',
    'grain screened',
    'ink color',
    'proof on ctd ink',
    'blade mfg',
    'cylinder divisional',
    'paper type',
    'ink type',
    'direct stream',
    'solvent type',
    'type on cylinder',
    'press type',
    'press',
    'unit number',
    'cylinder size',
    'paper mill location',
    'plating tank',
    'proof cut',
    'viscosity',
    'caliper',
    'ink temperature',
    'humidity',
    'roughness',
    'blade pressure',
    'varnish pct',
    'press speed',
    'ink pct',
    'solvent pct',
    'ESA Voltage',
    'ESA Amperage',
    'wax',
    'hardener',
    'roller durometer',
    'current density',
    'anode space ratio',
    'chrome content',
    'band type',
]

In [27]:
len(attributes)


Out[27]:
40

In [28]:
cylinder_bands.columns = attributes

In [29]:
cylinder_bands.head()


Out[29]:
timestamp cylinder number customer job number grain screened ink color proof on ctd ink blade mfg cylinder divisional paper type ... solvent pct ESA Voltage ESA Amperage wax hardener roller durometer current density anode space ratio chrome content band type
0 19910108 X126 TVGUIDE 25503 YES KEY YES BENTON GALLATIN UNCOATED ... 36.4 0 0 2.5 1 34 40 105 100 band
1 19910109 X266 TVGUIDE 25503 YES KEY YES BENTON GALLATIN UNCOATED ... 38.5 0 0 2.5 0.7 34 40 105 100 noband
2 19910104 B7 MODMAT 47201 YES KEY YES BENTON GALLATIN UNCOATED ... 39.8 0 0 2.8 0.9 40 40 103.87 100 noband
3 19910104 T133 MASSEY 39039 YES KEY YES BENTON GALLATIN UNCOATED ... 38.8 0 0 2.5 1.3 40 40 108.06 100 noband
4 19910111 J34 KMART 37351 NO KEY YES BENTON GALLATIN UNCOATED ... 42.5 5 0 2.3 0.6 35 40 106.67 100 noband

5 rows × 40 columns

Reformatting Data Types


In [30]:
cylinder_bands.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 40 columns):
timestamp              541 non-null object
cylinder number        541 non-null object
customer               541 non-null object
job number             541 non-null object
grain screened         541 non-null object
ink color              541 non-null object
proof on ctd ink       541 non-null object
blade mfg              541 non-null object
cylinder divisional    541 non-null object
paper type             541 non-null object
ink type               541 non-null object
direct stream          541 non-null object
solvent type           541 non-null object
type on cylinder       541 non-null object
press type             541 non-null object
press                  541 non-null object
unit number            541 non-null float64
cylinder size          540 non-null object
paper mill location    540 non-null object
plating tank           540 non-null object
proof cut              540 non-null object
viscosity              540 non-null object
caliper                540 non-null object
ink temperature        539 non-null object
humidity               539 non-null object
roughness              539 non-null object
blade pressure         539 non-null object
varnish pct            539 non-null object
press speed            539 non-null object
ink pct                539 non-null object
solvent pct            539 non-null object
ESA Voltage            539 non-null object
ESA Amperage           539 non-null object
wax                    539 non-null object
hardener               539 non-null object
roller durometer       539 non-null object
current density        539 non-null object
anode space ratio      539 non-null object
chrome content         539 non-null object
band type              539 non-null object
dtypes: float64(1), object(39)
memory usage: 169.1+ KB

In [31]:
dt_attributes = [
    'timestamp',
]

cat_attributes = [
    'cylinder number',
    'customer',
    'job number',
    'grain screened',
    'ink color',
    'proof on ctd ink',
    'blade mfg',
    'cylinder divisional',
    'paper type',
    'ink type',
    'direct stream',
    'solvent type',
    'type on cylinder',
    'press type',
    'press',
    'unit number',
    'cylinder size',
    'paper mill location',
    'plating tank',
]

num_attributes = [
    'proof cut',
    'viscosity',
    'caliper',
    'ink temperature',
    'humidity',
    'roughness',
    'blade pressure',
    'varnish pct',
    'press speed',
    'ink pct',
    'solvent pct',
    'ESA Voltage',
    'ESA Amperage',
    'wax',
    'hardener',
    'roller durometer',
    'current density',
    'anode space ratio',
    'chrome content',
]

In [32]:
def contextual_convert(df, dt_attributes=dt_attributes, num_attributes=num_attributes, cat_attributes=cat_attributes):
    attributes = dt_attributes + num_attributes + cat_attributes
    for att in attributes:
        if att in dt_attributes:
            df[att] = pd.to_datetime(df[att], format="%Y%m%d", errors='coerce')
        elif att in num_attributes:
            df[att] = pd.to_numeric(df[att], errors='coerce')
        elif att in cat_attributes:
            df[att] = df[att].astype('category')
        else:
            pass

In [33]:
contextual_convert(cylinder_bands)

In [34]:
cylinder_bands[dt_attributes].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 1 columns):
timestamp    540 non-null datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 4.3 KB

In [35]:
cylinder_bands[cat_attributes].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 19 columns):
cylinder number        541 non-null category
customer               541 non-null category
job number             541 non-null category
grain screened         541 non-null category
ink color              541 non-null category
proof on ctd ink       541 non-null category
blade mfg              541 non-null category
cylinder divisional    541 non-null category
paper type             541 non-null category
ink type               541 non-null category
direct stream          541 non-null category
solvent type           541 non-null category
type on cylinder       541 non-null category
press type             541 non-null category
press                  541 non-null category
unit number            541 non-null category
cylinder size          540 non-null category
paper mill location    540 non-null category
plating tank           540 non-null category
dtypes: category(19)
memory usage: 53.7 KB

In [36]:
cylinder_bands[num_attributes].info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 19 columns):
proof cut            486 non-null float64
viscosity            535 non-null float64
caliper              512 non-null float64
ink temperature      537 non-null float64
humidity             538 non-null float64
roughness            509 non-null float64
blade pressure       476 non-null float64
varnish pct          484 non-null float64
press speed          529 non-null float64
ink pct              484 non-null float64
solvent pct          484 non-null float64
ESA Voltage          483 non-null float64
ESA Amperage         485 non-null float64
wax                  533 non-null float64
hardener             532 non-null float64
roller durometer     485 non-null float64
current density      532 non-null float64
anode space ratio    532 non-null float64
chrome content       536 non-null float64
dtypes: float64(19)
memory usage: 80.4 KB

In [37]:
cylinder_bands['customer'].str.upper()


Out[37]:
0            TVGUIDE
1            TVGUIDE
2             MODMAT
3             MASSEY
4              KMART
5             MASSEY
6              ROSES
7              ROSES
8             MODMAT
9         CHILDCRAFT
10        CHILDCRAFT
11       HANOVRHOUSE
12        HANOVRHOUS
13        GUIDEPOSTS
14        HANOVRHOUS
15            MODMAT
16          HOMESHOP
17          HOMESHOP
18             USCAV
19             USCAV
20          COLORTIL
21          COLORTIL
22             WARDS
23            TARGET
24             KMART
25             KMART
26             KMART
27             KMART
28         WOOLWORTH
29              AMES
           ...      
511          VENTURE
512         HOMESHOP
513         HOMESHOP
514           MODMAT
515           TARGET
516              JFK
517    COLORFULIMAGE
518         BESTPROD
519            KMART
520          TOYSRUS
521          ECKERDS
522             1910
523           MODMAT
524            KMART
525            KMART
526           MODMAT
527          TOYSRUS
528         HOMESHOP
529          TOYSRUS
530              JCP
531          WALMART
532            KMART
533           ECKERD
534           ECKERD
535            KMART
536            KMART
537             BEST
538            KMART
539            KMART
540             BEST
Name: customer, Length: 541, dtype: object

In [38]:
cylinder_bands['customer'].value_counts()


Out[38]:
MODMAT            61
KMART             54
TARGET            40
TVGUIDE           38
WARDS             33
AMES              24
TOYSRUS           23
ROSES             22
kmart             13
BESTPROD           9
SHEPLERS           9
ECKERD             9
WOOLWORTH          9
CHILDCRAFT         9
WALMART            9
AUSTADS            8
HILLS              8
MASSEY             7
ECKERDS            7
DOWNS              7
GUIDEPOSTS         6
USCAV              6
REI                5
COLORTILE          5
HANOVERHOUSE       4
BELK               4
homeshop           4
DUNNS              4
SEARS              4
ABBEY              4
                  ..
HOMESHOP           2
HOMESHOPPING       2
WOOLWRTH           2
CAMPINGWORLD       2
JCPENNY            2
BURDINES           2
SERVMERCH          2
LAZARUS            2
OLDVILLAGESHOP     2
BRENDLS            2
COLORTIL           2
ADCO               1
walmart            1
GALLS              1
colorfulimage      1
target             1
jfk                1
jcp                1
homeshopping       1
hanoverhouse       1
eckerds            1
YIELDHOUSE         1
GLOBAL             1
PENNEY             1
NTLWILDLIFE        1
KIDSRUS            1
venture            1
HANOVRHOUSE        1
GLOBALEQUP         1
1910               1
Name: customer, Length: 84, dtype: int64

In [53]:
def capitalize_columns(df, cols):
    for col in cols:
        df[col] = df[col].str.upper()

In [39]:
cylinder_bands['customer'] = cylinder_bands['customer'].str.upper()

In [54]:
capitalize_columns(cylinder_bands, ['customer', 'cylinder number'])

Preprocessing Pipelines


In [58]:
from category_encoders import OrdinalEncoder, OneHotEncoder

In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin

# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

Scratch Work


In [65]:
cylinder_bands['cylinder size'].value_counts()


Out[65]:
TABLOID    281
CATALOG    164
SPIEGEL     54
tabloid     17
catalog     14
spiegel      6
?            3
0.7          1
Name: cylinder size, dtype: int64

In [74]:
cylinder_bands['plating tank'].value_counts()


Out[74]:
1910    331
1911    190
?        18
40        1
Name: plating tank, dtype: int64

In [67]:
cylinder_bands[cylinder_bands['cylinder size'] == "0.7"]


Out[67]:
timestamp cylinder number customer job number grain screened ink color proof on ctd ink blade mfg cylinder divisional paper type ... solvent pct ESA Voltage ESA Amperage wax hardener roller durometer current density anode space ratio chrome content band type
522 NaT ? 1910 ? 45 0.200 17 84 0.8125 27 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

1 rows × 40 columns


In [68]:
raw_cylinder_bands = load_data(filename)

In [70]:
raw_cylinder_bands.iloc[522]


Out[70]:
0     tabloid
1           ?
2        1910
3           ?
4          45
5       0.200
6          17
7          84
8      0.8125
9          27
10          ?
11       1865
12          ?
13          ?
14          ?
15          ?
16        1.5
17        0.7
18          ?
19         40
20      109.1
21         95
22       band
23        NaN
24        NaN
25        NaN
26        NaN
27        NaN
28        NaN
29        NaN
30        NaN
31        NaN
32        NaN
33        NaN
34        NaN
35        NaN
36        NaN
37        NaN
38        NaN
39        NaN
Name: 522, dtype: object